# Read the high- and low-popularity CSV exports and stack them row-wise
# into a single data frame.
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))
# Preview the first six rows of the combined data.
head(spotify_data, n = 6L)
## energy tempo danceability playlist_genre loudness liveness valence
## 1 0.592 157.969 0.521 pop -7.777 0.1220 0.535
## 2 0.507 104.978 0.747 pop -10.171 0.1170 0.438
## 3 0.808 108.548 0.554 pop -4.169 0.1590 0.372
## 4 0.910 112.966 0.670 pop -4.070 0.3040 0.786
## 5 0.783 149.027 0.777 pop -4.477 0.3550 0.939
## 6 0.582 116.712 0.700 pop -5.960 0.0881 0.785
## track_artist time_signature speechiness track_popularity
## 1 Lady Gaga, Bruno Mars 3 0.0304 100
## 2 Billie Eilish 4 0.0358 97
## 3 Gracie Abrams 4 0.0368 93
## 4 Sabrina Carpenter 4 0.0634 81
## 5 ROSÉ, Bruno Mars 4 0.2600 98
## 6 Chappell Roan 4 0.0356 94
## track_href
## 1 https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/tracks/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/tracks/0WbMK4wrZ1wFSty9F7FCgu
## uri track_album_name
## 1 spotify:track:2plbrEY59IikOBgBGLjaoe Die With A Smile
## 2 spotify:track:6dOtVTDdiauQNBQEDOtlAB HIT ME HARD AND SOFT
## 3 spotify:track:7ne4VBA60CxGM75vw0EYad The Secret of Us (Deluxe)
## 4 spotify:track:1d7Ptw3qYcfpdLNL5REhtJ Short n' Sweet
## 5 spotify:track:5vNRhkKd0yEAg8suGBpjeY APT.
## 6 spotify:track:0WbMK4wrZ1wFSty9F7FCgu Good Luck, Babe!
## playlist_name
## 1 Today's Top Hits
## 2 Today's Top Hits
## 3 Today's Top Hits
## 4 Today's Top Hits
## 5 Today's Top Hits
## 6 Today's Top Hits
## analysis_url
## 1 https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/audio-analysis/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/audio-analysis/0WbMK4wrZ1wFSty9F7FCgu
## track_id track_name track_album_release_date
## 1 2plbrEY59IikOBgBGLjaoe Die With A Smile 2024-08-16
## 2 6dOtVTDdiauQNBQEDOtlAB BIRDS OF A FEATHER 2024-05-17
## 3 7ne4VBA60CxGM75vw0EYad That’s So True 2024-10-18
## 4 1d7Ptw3qYcfpdLNL5REhtJ Taste 2024-08-23
## 5 5vNRhkKd0yEAg8suGBpjeY APT. 2024-10-18
## 6 0WbMK4wrZ1wFSty9F7FCgu Good Luck, Babe! 2024-04-05
## instrumentalness track_album_id mode key duration_ms acousticness
## 1 0.0000 10FLjwfpbxLmW8c25Xyc2N 0 6 251668 0.3080
## 2 0.0608 7aJuG4TFXa2hmE4z1yxc3n 1 2 210373 0.2000
## 3 0.0000 0hBRqPYPXhr1RkTDG3n4Mk 1 1 166300 0.2140
## 4 0.0000 4B4Elma4nNDUyl6D5PvQkj 0 0 157280 0.0939
## 5 0.0000 2IYQwwgxgOIn7t3iF6ufFD 0 0 169917 0.0283
## 6 0.0000 1WAjjRMfZjEXtB0lQrAw6Q 0 11 218424 0.0502
## id playlist_subgenre type
## 1 2plbrEY59IikOBgBGLjaoe mainstream audio_features
## 2 6dOtVTDdiauQNBQEDOtlAB mainstream audio_features
## 3 7ne4VBA60CxGM75vw0EYad mainstream audio_features
## 4 1d7Ptw3qYcfpdLNL5REhtJ mainstream audio_features
## 5 5vNRhkKd0yEAg8suGBpjeY mainstream audio_features
## 6 0WbMK4wrZ1wFSty9F7FCgu mainstream audio_features
## playlist_id
## 1 37i9dQZF1DXcBWIGoYBM5M
## 2 37i9dQZF1DXcBWIGoYBM5M
## 3 37i9dQZF1DXcBWIGoYBM5M
## 4 37i9dQZF1DXcBWIGoYBM5M
## 5 37i9dQZF1DXcBWIGoYBM5M
## 6 37i9dQZF1DXcBWIGoYBM5M
# Open the spreadsheet-style viewer only when a person is at the console:
# View() needs an interactive display, so it is skipped when the script is
# run or knitted non-interactively.
if (interactive()) {
  View(spotify_data)
}
#3
# Print the dimensions, column types, and a preview of every column.
str(spotify_data)
## 'data.frame': 4831 obs. of 29 variables:
## $ energy : num 0.592 0.507 0.808 0.91 0.783 0.582 0.561 0.247 0.416 0.722 ...
## $ tempo : num 158 105 109 113 149 ...
## $ danceability : num 0.521 0.747 0.554 0.67 0.777 0.7 0.669 0.467 0.492 0.769 ...
## $ playlist_genre : chr "pop" "pop" "pop" "pop" ...
## $ loudness : num -7.78 -10.17 -4.17 -4.07 -4.48 ...
## $ liveness : num 0.122 0.117 0.159 0.304 0.355 0.0881 0.0954 0.17 0.203 0.111 ...
## $ valence : num 0.535 0.438 0.372 0.786 0.939 0.785 0.841 0.126 0.297 0.57 ...
## $ track_artist : chr "Lady Gaga, Bruno Mars" "Billie Eilish" "Gracie Abrams" "Sabrina Carpenter" ...
## $ time_signature : int 3 4 4 4 4 4 4 4 4 4 ...
## $ speechiness : num 0.0304 0.0358 0.0368 0.0634 0.26 0.0356 0.0411 0.0431 0.0254 0.0507 ...
## $ track_popularity : int 100 97 93 81 98 94 88 93 71 92 ...
## $ track_href : chr "https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe" "https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB" "https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad" "https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ" ...
## $ uri : chr "spotify:track:2plbrEY59IikOBgBGLjaoe" "spotify:track:6dOtVTDdiauQNBQEDOtlAB" "spotify:track:7ne4VBA60CxGM75vw0EYad" "spotify:track:1d7Ptw3qYcfpdLNL5REhtJ" ...
## $ track_album_name : chr "Die With A Smile" "HIT ME HARD AND SOFT" "The Secret of Us (Deluxe)" "Short n' Sweet" ...
## $ playlist_name : chr "Today's Top Hits" "Today's Top Hits" "Today's Top Hits" "Today's Top Hits" ...
## $ analysis_url : chr "https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe" "https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB" "https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad" "https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ" ...
## $ track_id : chr "2plbrEY59IikOBgBGLjaoe" "6dOtVTDdiauQNBQEDOtlAB" "7ne4VBA60CxGM75vw0EYad" "1d7Ptw3qYcfpdLNL5REhtJ" ...
## $ track_name : chr "Die With A Smile" "BIRDS OF A FEATHER" "That’s So True" "Taste" ...
## $ track_album_release_date: chr "2024-08-16" "2024-05-17" "2024-10-18" "2024-08-23" ...
## $ instrumentalness : num 0.00 6.08e-02 0.00 0.00 0.00 0.00 9.62e-03 2.71e-04 8.61e-05 2.56e-06 ...
## $ track_album_id : chr "10FLjwfpbxLmW8c25Xyc2N" "7aJuG4TFXa2hmE4z1yxc3n" "0hBRqPYPXhr1RkTDG3n4Mk" "4B4Elma4nNDUyl6D5PvQkj" ...
## $ mode : int 0 1 1 0 0 0 1 0 1 0 ...
## $ key : int 6 2 1 0 0 11 10 6 11 11 ...
## $ duration_ms : int 251668 210373 166300 157280 169917 218424 169698 261467 211979 256000 ...
## $ acousticness : num 0.308 0.2 0.214 0.0939 0.0283 0.0502 0.495 0.612 0.686 0.0584 ...
## $ id : chr "2plbrEY59IikOBgBGLjaoe" "6dOtVTDdiauQNBQEDOtlAB" "7ne4VBA60CxGM75vw0EYad" "1d7Ptw3qYcfpdLNL5REhtJ" ...
## $ playlist_subgenre : chr "mainstream" "mainstream" "mainstream" "mainstream" ...
## $ type : chr "audio_features" "audio_features" "audio_features" "audio_features" ...
## $ playlist_id : chr "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" ...
# 29 columns, 4831 rows (as reported by str() above)
# 4
# Per-column summary statistics; numeric columns also report their NA counts.
summary(spotify_data)
## energy tempo danceability playlist_genre
## Min. :0.000202 Min. : 48.23 Min. :0.0589 Length:4831
## 1st Qu.:0.442250 1st Qu.: 96.06 1st Qu.:0.5250 Class :character
## Median :0.633000 Median :118.06 Median :0.6530 Mode :character
## Mean :0.586691 Mean :118.27 Mean :0.6223
## 3rd Qu.:0.777000 3rd Qu.:136.72 3rd Qu.:0.7580
## Max. :0.998000 Max. :241.43 Max. :0.9790
## NA's :1 NA's :1 NA's :1
## loudness liveness valence track_artist
## Min. :-48.069 Min. :0.0210 Min. :0.0296 Length:4831
## 1st Qu.:-10.298 1st Qu.:0.0954 1st Qu.:0.2750 Class :character
## Median : -7.191 Median :0.1180 Median :0.4830 Mode :character
## Mean : -9.282 Mean :0.1676 Mean :0.4819
## 3rd Qu.: -5.337 3rd Qu.:0.1950 3rd Qu.:0.6900
## Max. : 1.318 Max. :0.9790 Max. :0.9870
## NA's :1 NA's :1 NA's :1
## time_signature speechiness track_popularity track_href
## Min. :1.000 Min. :0.0219 Min. : 11.00 Length:4831
## 1st Qu.:4.000 1st Qu.:0.0386 1st Qu.: 41.00 Class :character
## Median :4.000 Median :0.0561 Median : 56.00 Mode :character
## Mean :3.937 Mean :0.1017 Mean : 54.76
## 3rd Qu.:4.000 3rd Qu.:0.1180 3rd Qu.: 72.00
## Max. :5.000 Max. :0.9270 Max. :100.00
## NA's :1 NA's :1
## uri track_album_name playlist_name analysis_url
## Length:4831 Length:4831 Length:4831 Length:4831
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## track_id track_name track_album_release_date
## Length:4831 Length:4831 Length:4831
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## instrumentalness track_album_id mode key
## Min. :0.0000000 Length:4831 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.0000000 Class :character 1st Qu.:0.0000 1st Qu.: 2.000
## Median :0.0000913 Mode :character Median :1.0000 Median : 5.000
## Mean :0.2010526 Mean :0.5621 Mean : 5.233
## 3rd Qu.:0.2005000 3rd Qu.:1.0000 3rd Qu.: 8.000
## Max. :0.9910000 Max. :1.0000 Max. :11.000
## NA's :1 NA's :1 NA's :1
## duration_ms acousticness id playlist_subgenre
## Min. : 35375 Min. :0.0000036 Length:4831 Length:4831
## 1st Qu.: 159000 1st Qu.:0.0529250 Class :character Class :character
## Median : 194866 Median :0.2245000 Mode :character Mode :character
## Mean : 206151 Mean :0.3412170
## 3rd Qu.: 233478 3rd Qu.:0.5900000
## Max. :1355260 Max. :0.9960000
## NA's :1 NA's :1
## type playlist_id
## Length:4831 Length:4831
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
https://www.kaggle.com/datasets/solomonameh/spotify-music-dataset
The dataset consists of Spotify songs and their respective attributes, as provided by Spotify.
Each observation is a song along with its data.
Energy - A measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.
Tempo - The speed of a track, measured in beats per minute (BPM).
Danceability - A score describing how suitable a track is for dancing based on tempo, rhythm stability, beat strength and overall regularity.
Loudness - The overall loudness of a track in decibels (dB). Higher values indicate louder tracks overall.
Liveness - The likelihood of a track being performed live. Higher values suggest more audience presence.
Valence - The overall musical positiveness(emotion) of a track. High valence sounds happy; low valence sounds sad or angry.
Speechiness - Measures the presence of spoken words.
Instrumentalness - The likelihood a track contains no vocals. Values closer to 1.0 suggest solely instrumental tracks.
Mode - Indicates the modality of the track.
Key - The musical key, represented as an integer from 0 to 11, mapping to standard Pitch class notation.
Duration_ms - The length of the track in milliseconds.
Acousticness - A confidence measure of whether a track is acoustic(1) or not(0).
Track Name - The name of the track.
Track Artist - The artist(s) performing the track.
Track Album Name - The album in which the track is featured.
Track Album Release Date - The release date of the album containing the track.
Track ID - A unique identifier assigned to the track by Spotify.
Track Album ID - A unique identifier for the album.
Playlist Name - The name of the playlist where the track is included.
Playlist Genre - The main genre associated with the playlist (e.g., pop, rock, classical).
Playlist Subgenre - A more specific subgenre tied to the playlist (e.g., indie pop, punk rock).
Playlist ID - A unique identifier for the playlist.
Track Popularity - A score (0–100) which is calculated based on total number of streams in relation to other songs.
4831 Rows
29 Columns
Variables such as Energy and Danceability, which are generated by Spotify rather than taken directly from the song, are on a scale from 0 to 1.
Loudness is measured in decibels: values around 0 dB are the loudest, and negative values signify that the track is that many decibels quieter than normal output.
Tempo is on a scale of beats per minute.
Duration_ms is the length of a song in milliseconds.
# Re-read both popularity CSVs, stack them, and save the combined table
# to disk as a single CSV without row names.
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))
write.csv(x = spotify_data, file = "spotify_data.csv", row.names = FALSE)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
library(ggcorrplot)
# Show the first six rows of the combined data as a sanity check.
head(spotify_data, n = 6L)
## energy tempo danceability playlist_genre loudness liveness valence
## 1 0.592 157.969 0.521 pop -7.777 0.1220 0.535
## 2 0.507 104.978 0.747 pop -10.171 0.1170 0.438
## 3 0.808 108.548 0.554 pop -4.169 0.1590 0.372
## 4 0.910 112.966 0.670 pop -4.070 0.3040 0.786
## 5 0.783 149.027 0.777 pop -4.477 0.3550 0.939
## 6 0.582 116.712 0.700 pop -5.960 0.0881 0.785
## track_artist time_signature speechiness track_popularity
## 1 Lady Gaga, Bruno Mars 3 0.0304 100
## 2 Billie Eilish 4 0.0358 97
## 3 Gracie Abrams 4 0.0368 93
## 4 Sabrina Carpenter 4 0.0634 81
## 5 ROSÉ, Bruno Mars 4 0.2600 98
## 6 Chappell Roan 4 0.0356 94
## track_href
## 1 https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/tracks/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/tracks/0WbMK4wrZ1wFSty9F7FCgu
## uri track_album_name
## 1 spotify:track:2plbrEY59IikOBgBGLjaoe Die With A Smile
## 2 spotify:track:6dOtVTDdiauQNBQEDOtlAB HIT ME HARD AND SOFT
## 3 spotify:track:7ne4VBA60CxGM75vw0EYad The Secret of Us (Deluxe)
## 4 spotify:track:1d7Ptw3qYcfpdLNL5REhtJ Short n' Sweet
## 5 spotify:track:5vNRhkKd0yEAg8suGBpjeY APT.
## 6 spotify:track:0WbMK4wrZ1wFSty9F7FCgu Good Luck, Babe!
## playlist_name
## 1 Today's Top Hits
## 2 Today's Top Hits
## 3 Today's Top Hits
## 4 Today's Top Hits
## 5 Today's Top Hits
## 6 Today's Top Hits
## analysis_url
## 1 https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/audio-analysis/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/audio-analysis/0WbMK4wrZ1wFSty9F7FCgu
## track_id track_name track_album_release_date
## 1 2plbrEY59IikOBgBGLjaoe Die With A Smile 2024-08-16
## 2 6dOtVTDdiauQNBQEDOtlAB BIRDS OF A FEATHER 2024-05-17
## 3 7ne4VBA60CxGM75vw0EYad That’s So True 2024-10-18
## 4 1d7Ptw3qYcfpdLNL5REhtJ Taste 2024-08-23
## 5 5vNRhkKd0yEAg8suGBpjeY APT. 2024-10-18
## 6 0WbMK4wrZ1wFSty9F7FCgu Good Luck, Babe! 2024-04-05
## instrumentalness track_album_id mode key duration_ms acousticness
## 1 0.0000 10FLjwfpbxLmW8c25Xyc2N 0 6 251668 0.3080
## 2 0.0608 7aJuG4TFXa2hmE4z1yxc3n 1 2 210373 0.2000
## 3 0.0000 0hBRqPYPXhr1RkTDG3n4Mk 1 1 166300 0.2140
## 4 0.0000 4B4Elma4nNDUyl6D5PvQkj 0 0 157280 0.0939
## 5 0.0000 2IYQwwgxgOIn7t3iF6ufFD 0 0 169917 0.0283
## 6 0.0000 1WAjjRMfZjEXtB0lQrAw6Q 0 11 218424 0.0502
## id playlist_subgenre type
## 1 2plbrEY59IikOBgBGLjaoe mainstream audio_features
## 2 6dOtVTDdiauQNBQEDOtlAB mainstream audio_features
## 3 7ne4VBA60CxGM75vw0EYad mainstream audio_features
## 4 1d7Ptw3qYcfpdLNL5REhtJ mainstream audio_features
## 5 5vNRhkKd0yEAg8suGBpjeY mainstream audio_features
## 6 0WbMK4wrZ1wFSty9F7FCgu mainstream audio_features
## playlist_id
## 1 37i9dQZF1DXcBWIGoYBM5M
## 2 37i9dQZF1DXcBWIGoYBM5M
## 3 37i9dQZF1DXcBWIGoYBM5M
## 4 37i9dQZF1DXcBWIGoYBM5M
## 5 37i9dQZF1DXcBWIGoYBM5M
## 6 37i9dQZF1DXcBWIGoYBM5M
#View(spotify_data)
# First plot attempt: energy against tempo, colored by genre. With roughly
# 4800 points and so many genre colors it was not readable, so it went unused.
plot <- ggplot(
  data = spotify_data,
  mapping = aes(energy, tempo, colour = playlist_genre)
) +
  geom_point()
show(plot)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Baseline box plot of track popularity for each playlist genre.
boxplot <- ggplot(
  data = spotify_data,
  mapping = aes(playlist_genre, track_popularity)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90)) # hard to read otherwise
show(boxplot)
# Added for readability of the next plot (sets the repr.plot.* size options).
options(repr.plot.width = 100, repr.plot.height = 6)
# Polished version: one fill color per genre, red-filled outlier markers,
# bold axis titles, slanted genre labels, and no legend.
boxplot <- ggplot(
  data = spotify_data,
  mapping = aes(playlist_genre, track_popularity, fill = playlist_genre)
) +
  geom_boxplot(
    alpha = 0.7,
    outlier.shape = 21,
    outlier.fill = "red",
    outlier.size = 2
  ) +
  labs(x = "Playlist Genre", y = "Track Popularity") +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1, vjust = 1)
  )
show(boxplot)
# Save a wide copy of the polished plot to disk.
ggsave(
  filename = "boxplot_wide.png",
  plot = boxplot,
  width = 13.5,
  height = 6,
  dpi = 300
)
# Pull out the popularity scores of the two genres being compared.
electronic_popularity <- with(
  spotify_data,
  track_popularity[playlist_genre == "electronic"]
)
lofi_popularity <- with(
  spotify_data,
  track_popularity[playlist_genre == "lofi"]
)
# One-sided Welch two-sample t-test (unequal variances is the t.test()
# default): is mean electronic popularity greater than mean lofi popularity?
t_test_result <- t.test(
  x = electronic_popularity,
  y = lofi_popularity,
  alternative = "greater"
)
# Print results
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: electronic_popularity and lofi_popularity
## t = 6.8614, df = 861.18, p-value = 6.513e-12
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 4.489476 Inf
## sample estimates:
## mean of x mean of y
## 52.93718 47.03010
# Visualize correlation using a heatmap.
# `cor_matrix` was not defined anywhere before this call, so build it here
# from every numeric column of the data.
# use = "complete.obs" drops rows containing a missing value; the summary
# above shows one NA in each audio-feature column, which would otherwise
# turn every affected correlation into NA.
numeric_columns <- spotify_data[vapply(spotify_data, is.numeric, logical(1))]
cor_matrix <- cor(numeric_columns, use = "complete.obs")
ggcorrplot(
  cor_matrix,
  method = "circle",
  type = "lower",
  lab = TRUE,
  lab_size = 2.5
)
# Mean of each audio feature per playlist genre, ignoring missing values.
# Each output column is named "avg_<feature>".
average_features <- spotify_data %>%
  group_by(playlist_genre) %>%
  summarise(
    across(
      c(
        energy, tempo, danceability, loudness, liveness, valence,
        speechiness, instrumentalness, duration_ms, acousticness
      ),
      function(x) mean(x, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )
str(average_features)
## tibble [35 × 11] (S3: tbl_df/tbl/data.frame)
## $ playlist_genre : chr [1:35] "afrobeats" "ambient" "arabic" "blues" ...
## $ avg_energy : num [1:35] 0.691 0.465 0.637 0.505 0.689 ...
## $ avg_tempo : num [1:35] 118 110 117 111 118 ...
## $ avg_danceability : num [1:35] 0.728 0.549 0.704 0.604 0.728 ...
## $ avg_loudness : num [1:35] -7.97 -13.72 -7.49 -8.8 -6.56 ...
## $ avg_liveness : num [1:35] 0.148 0.156 0.155 0.157 0.154 ...
## $ avg_valence : num [1:35] 0.608 0.431 0.539 0.424 0.742 ...
## $ avg_speechiness : num [1:35] 0.1123 0.1346 0.1677 0.0778 0.1242 ...
## $ avg_instrumentalness: num [1:35] 0.1275 0.2052 0.0915 0.0121 0.0566 ...
## $ avg_duration_ms : num [1:35] 264409 185782 172701 255396 193319 ...
## $ avg_acousticness : num [1:35] 0.189 0.49 0.321 0.418 0.346 ...
# Reshape to long form: one row per (genre, feature) pair, with the feature
# name in `feature` and its per-genre average in `value`.
average_features_long <- pivot_longer(
  average_features,
  cols = -playlist_genre,
  names_to = "feature",
  values_to = "value"
)
str(average_features_long)
## tibble [350 × 3] (S3: tbl_df/tbl/data.frame)
## $ playlist_genre: chr [1:350] "afrobeats" "afrobeats" "afrobeats" "afrobeats" ...
## $ feature : chr [1:350] "avg_energy" "avg_tempo" "avg_danceability" "avg_loudness" ...
## $ value : num [1:350] 0.691 118.186 0.728 -7.973 0.148 ...
# Build a bar chart of one averaged audio feature across playlist genres.
#
# feature_name: a value of the `feature` column of `average_features_long`
#   (for example "avg_energy"); the "avg_" prefix is stripped for the title.
# Returns the ggplot object; nothing is printed here.
plot_feature <- function(feature_name) {
  feature_rows <- filter(average_features_long, feature == feature_name)
  chart_title <- paste("Average", gsub("avg_", "", feature_name), "by Genre")
  genre_axis_text <- element_text(
    angle = 90,
    hjust = 1,
    size = 10,
    color = "black"
  )

  ggplot(
    feature_rows,
    aes(x = playlist_genre, y = value, fill = playlist_genre)
  ) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    labs(title = chart_title, x = "Playlist Genre", y = "Average Value") +
    theme_minimal() +
    theme(axis.text.x = genre_axis_text)
}
# Draw one bar chart per distinct averaged feature.
feature_names <- unique(average_features_long[["feature"]])
for (feature in feature_names) {
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(
    plot_feature(feature)
  )
}
# Filter the data for Billie Eilish and Bruno Mars.
# %in% matches the track_artist string exactly, so rows whose artist field
# lists several artists (e.g. "Lady Gaga, Bruno Mars") are not included.
filtered_data <- spotify_data %>%
  filter(track_artist %in% c("Billie Eilish", "Bruno Mars"))
# Create the skeletal box plot (without outliers) of energy per artist.
ggplot(filtered_data, aes(x = track_artist, y = energy, fill = track_artist)) +
  geom_boxplot(outlier.shape = NA) + # Hide outlier points to keep it skeletal
  labs(
    title = "Comparison of Energy Levels: Billie Eilish vs. Bruno Mars",
    x = "Artist",
    y = "Energy Level"
  ) +
  scale_fill_manual(
    values = c("Billie Eilish" = "green", "Bruno Mars" = "orange")
  ) +
  theme_minimal()
# Same comparison for track popularity. The x-axis shows the artist and the
# y-axis shows popularity, so label them accordingly (the x-axis was
# previously labelled "Track Popularity" and the y-axis had no label).
ggplot(
  filtered_data,
  aes(x = track_artist, y = track_popularity, fill = track_artist)
) +
  geom_boxplot(outlier.shape = NA) + # Hide outlier points to keep it skeletal
  labs(
    title = "Comparison of Track Popularity: Billie Eilish vs. Bruno Mars",
    x = "Artist",
    y = "Track Popularity"
  ) +
  scale_fill_manual(
    values = c("Billie Eilish" = "green", "Bruno Mars" = "orange")
  ) +
  theme_minimal()

# Simple linear regression: track popularity explained by instrumentalness.
lm_model <- lm(
  formula = track_popularity ~ instrumentalness,
  data = spotify_data
)
# Display summary of the regression model
summary(object = lm_model)
##
## Call:
## lm(formula = track_popularity ~ instrumentalness, data = spotify_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.743 -12.811 3.257 14.755 42.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.7434 0.3171 182.11 <2e-16 ***
## instrumentalness -14.8283 0.7824 -18.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.13 on 4828 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06925, Adjusted R-squared: 0.06905
## F-statistic: 359.2 on 1 and 4828 DF, p-value: < 2.2e-16
# Scatter plot of the data with the fitted regression line drawn on top.
ggplot(
  data = spotify_data,
  mapping = aes(instrumentalness, track_popularity)
) +
  geom_point(colour = "blue", alpha = 0.5) + # semi-transparent points
  geom_smooth(method = "lm", se = TRUE, colour = "red") + # line with confidence interval
  labs(
    title = "Linear Regression: Instrumentalness vs. Track Popularity",
    x = "Instrumentalness",
    y = "Track Popularity"
  ) +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Simple linear regression: acousticness explained by instrumentalness.
lm_model1 <- lm(
  formula = acousticness ~ instrumentalness,
  data = spotify_data
)
# Display summary of the regression model
summary(object = lm_model1)
##
## Call:
## lm(formula = acousticness ~ instrumentalness, data = spotify_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71205 -0.21482 -0.06111 0.21744 0.74773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.246112 0.004634 53.12 <2e-16 ***
## instrumentalness 0.473034 0.011433 41.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2796 on 4828 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2617, Adjusted R-squared: 0.2616
## F-statistic: 1712 on 1 and 4828 DF, p-value: < 2.2e-16
# Scatter plot of the data with the fitted regression line drawn on top.
ggplot(
  data = spotify_data,
  mapping = aes(instrumentalness, acousticness)
) +
  geom_point(colour = "blue", alpha = 0.5) + # semi-transparent points
  geom_smooth(method = "lm", se = TRUE, colour = "red") + # line with confidence interval
  labs(
    title = "Linear Regression: Instrumentalness vs. Acousticness",
    x = "Instrumentalness",
    y = "Acousticness"
  ) +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Label each track "High" when its energy is above 0.7 and "Low" otherwise.
spotify_data <- mutate(
  spotify_data,
  energy_level = ifelse(energy > 0.7, "High", "Low")
)
# Keep only the tracks whose energy exceeds 0.7.
high_energy_tracks <- filter(spotify_data, energy > 0.7)
# Danceability against tempo for those tracks, colored by playlist genre.
ggplot(
  data = high_energy_tracks,
  mapping = aes(tempo, danceability, colour = playlist_genre)
) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Danceability vs. Tempo for High-Energy Tracks",
    x = "Tempo (BPM)",
    y = "Danceability",
    color = "Playlist Genre"
  )
# Which songs have the highest and lowest value of each feature?
# Start again from the raw CSV files and rebuild the combined data frame.
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))
# For each feature, find the song with the highest and lowest value
# Find the tracks with the highest and lowest value of one column.
#
# feature: name (single string) of the column to scan.
# data:    data frame holding `feature` plus `track_name` and `track_artist`
#          columns; defaults to the global `spotify_data`, so existing
#          one-argument calls behave as before.
#
# Returns a one-row data frame with the feature name and, for both the
# maximum and the minimum, the track name, the artist, and the value.
# Missing values are ignored; on ties the first matching row is used. If the
# column holds no non-missing value, the song, artist, and value fields are
# NA instead of the call failing.
# Stops with an error if `feature` is not a single string or a required
# column is absent.
find_extremes <- function(feature, data = spotify_data) {
  stopifnot(is.character(feature), length(feature) == 1L)
  required_columns <- c(feature, "track_name", "track_artist")
  missing_columns <- setdiff(required_columns, names(data))
  if (length(missing_columns) > 0L) {
    stop(
      "Column(s) not found in data: ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  values <- data[[feature]]
  max_idx <- which.max(values)
  min_idx <- which.min(values)
  # which.max()/which.min() return integer(0) when every value is NA;
  # indexing with NA_integer_ then yields a correctly typed NA per field.
  if (length(max_idx) == 0L) {
    max_idx <- NA_integer_
  }
  if (length(min_idx) == 0L) {
    min_idx <- NA_integer_
  }

  data.frame(
    feature = feature,
    highest_value_song = data[["track_name"]][max_idx],
    highest_value_artist = data[["track_artist"]][max_idx],
    highest_value = values[max_idx],
    lowest_value_song = data[["track_name"]][min_idx],
    lowest_value_artist = data[["track_artist"]][min_idx],
    lowest_value = values[min_idx]
  )
}
# Audio features to scan for their highest- and lowest-valued tracks.
features <- c(
  "energy", "tempo", "danceability", "loudness", "liveness",
  "valence", "speechiness", "instrumentalness", "duration_ms",
  "acousticness"
)
# One single-row result per feature, stacked into one table.
extreme_values <- bind_rows(lapply(features, find_extremes))
# Print the result
print(extreme_values)
## feature
## 1 energy
## 2 tempo
## 3 danceability
## 4 loudness
## 5 liveness
## 6 valence
## 7 speechiness
## 8 instrumentalness
## 9 duration_ms
## 10 acousticness
## highest_value_song
## 1 Hard Beat
## 2 Lo-fi Love Letters
## 3 Ice Ice Baby
## 4 REI DO BRASIL
## 5 Besame Mucho
## 6 Stop
## 7 Ucingo
## 8 Psalm 22.21
## 9 Turn It Up / No Longer a Slave / Made a Way / Ekwueme (Medley) [Live]
## 10 Gnossienne No. 1
## highest_value_artist highest_value
## 1 TNT, Darren Styles, Technoboy, Tuneboy 0.998
## 2 Idris Kelly 241.426
## 3 Vanilla Ice 0.979
## 4 Seek 1.318
## 5 Dave Brubeck 0.979
## 6 B.W.H. 0.987
## 7 Zee Nxumalo, Sly, GL_Ceejay, Kabza De Small, Shakes & Les 0.927
## 8 jung jaeil, VOCES8 0.991
## 9 Big Bolaji 1355260.000
## 10 Erik Satie, Alena Cherny 0.996
## lowest_value_song lowest_value_artist lowest_value
## 1 Foundation Matheo Lyon 2.0200e-04
## 2 Deep Peter Sandberg 4.8232e+01
## 3 Rosy Misha Burton 5.8900e-02
## 4 Oscalated Setareha -4.8069e+01
## 5 Ain't It Fun Paramore 2.1000e-02
## 6 Somnova Reso Nata 2.9600e-02
## 7 難得有情人 Shirley Kwan 2.1900e-02
## 8 Die With A Smile Lady Gaga, Bruno Mars 0.0000e+00
## 9 At the Library LoFi Waiter 3.5375e+04
## 10 Star Pool c152 3.5900e-06
# Save the table of per-feature extremes to disk without row names.
write.csv(x = extreme_values, file = "extreme_values.csv", row.names = FALSE)
This project allowed me to find a ton of interesting information about track popularity and songs with many different varieties of style. Running statistics on electronic and lofi music allowed me to definitively know that electronic is more popular on average than lofi music. I learned that Billie Eilish’s music has a large interquartile range (IQR). I also learned how different categories of music have large effects on energy, loudness, instrumentalness, and other features. Learning how to make graphs to find interesting meanings in the data is something I love to do; I’d love to go into data science as a job and help the world by supplying valuable information. This project was really fun. :)